package com.trytech.mongoocrawler.client.parser.jd;

import com.trytech.mongoocrawler.client.common.queue.UrlFetcherEventProducer;
import com.trytech.mongoocrawler.client.entity.JDItem;
import com.trytech.mongoocrawler.client.parser.HtmlParser;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Parser for JD book detail pages.
 *
 * <p>Fills a {@link JDItem} from fixed element ids and fixed list positions in the page
 * markup, so it depends on the exact layout of the page it is given.
 *
 * @author Collin Chiang
 * @date 2017-04-15
 */
public class JDBookDetailParser extends HtmlParser<JDItem> {
    private static final Logger LOGGER = Logger.getLogger(JDBookDetailParser.class.getName());

    /** Matches every character that is neither a digit nor a dot; used to clean the price text. */
    private static final Pattern NON_PRICE_CHARS = Pattern.compile("[^0-9.]");

    /** Matches literal parentheses; used to strip them from the comment-count text. */
    private static final Pattern PARENTHESES = Pattern.compile("[\\(\\)]");

    /**
     * Extracts the book details from the HTML of a detail page.
     *
     * @param html raw HTML of the page
     * @param urlProducer not used by this parser
     * @return the populated item, or {@code null} when an expected element is missing or a
     *     numeric field (price, good-comment rate) cannot be parsed
     */
    @Override
    public JDItem parse(String html, UrlFetcherEventProducer urlProducer) {
        try {
            JDItem item = new JDItem();
            Element body = getBody(html);
            Element itemInfoEle = body.getElementById("itemInfo");
            Element nameFragmentEle = itemInfoEle.getElementById("name");

            // Book title
            String name = nameFragmentEle.getElementsByTag("h1").first().text();
            item.setName(name);

            // Author
            String author = nameFragmentEle.getElementById("p-author").text();
            item.setAuthor(author);

            // Selling price: keep only digits and dots before converting to a float
            Element priceEle = itemInfoEle.getElementById("jd-price");
            String price = NON_PRICE_CHARS.matcher(priceEle.text()).replaceAll("");
            item.setPrice(Float.parseFloat(price));

            // NOTE(review): the <li> positions below (0, 1, 12, 3) are hard-coded; a page with
            // fewer entries or a different order fails the whole parse or mislabels fields.
            // Confirm against the current page layout.
            Element infoEle = body.getElementById("parameter2");
            Elements liEle = infoEle.getElementsByTag("li");

            // Publisher
            String agent = liEle.get(0).attr("title");
            item.setAgent(agent);

            // ISBN
            String isbn = liEle.get(1).attr("title");
            item.setIsbn(isbn);

            // Language
            String language = liEle.get(12).attr("title");
            item.setLanguage(language);

            // Product number
            String no = liEle.get(3).attr("title");
            item.setNo(no);

            // Good-comment rate: text of the first <strong> inside #i-comment
            Element commentEle = body.getElementById("i-comment");
            Element strongEle = commentEle.getElementsByTag("strong").first();
            String goodCommentRate = strongEle.text();
            item.setGoodCommentRate(Short.parseShort(goodCommentRate));

            // Comment count: text of the first <em> in the comment tab, parentheses removed
            Element commTabEle = body.getElementById("detail-tab-comm");
            Element commEle = commTabEle.getElementsByTag("em").first();
            String commentCount = PARENTHESES.matcher(commEle.text()).replaceAll("");
            item.setCommentCount(commentCount);

            return item;
        } catch (Exception e) {
            // Callers still receive null on failure; the cause is logged instead of being
            // silently discarded so that markup changes can be diagnosed.
            LOGGER.log(Level.WARNING, "Failed to parse JD book detail page", e);
            return null;
        }
    }
}
